library(beeswarm)
library(naniar)
# install.packages("janitor")
library(janitor)
Attaching package: ‘janitor’
The following objects are masked from ‘package:stats’:
chisq.test, fisher.test
library(dplyr)
# install.packages("GGally")
library(sets)
Attaching package: ‘sets’
The following object is masked from ‘package:janitor’:
%>%
The following object is masked from ‘package:naniar’:
%>%
The following object is masked from ‘package:dplyr’:
%>%
library(tidyverse)
── Attaching packages ──────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
✓ tibble 3.0.6 ✓ purrr 0.3.4
✓ tidyr 1.1.2 ✓ stringr 1.4.0
✓ readr 1.4.0 ✓ forcats 0.5.1
── Conflicts ─────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
x forcats::%>%() masks stringr::%>%(), purrr::%>%(), tidyr::%>%(), tibble::%>%(), sets::%>%(), janitor::%>%(), naniar::%>%(), dplyr::%>%()
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
x purrr::map() masks maps::map()
library(ggplot2)
library(GGally) # for ggpairs
Registered S3 method overwritten by 'GGally':
method from
+.gg ggplot2
Attaching package: ‘GGally’
The following object is masked from ‘package:sets’:
%>%
# install.packages("maps")
library(maps)
# devtools::install_github(“UrbanInstitute/urbnmapr”)
load_file <- function(file_path){
read.csv(file_path)
}
Need to add some things here about covid.
# Load some of the data
tx_data <- load_file("./../data/COVID-19_cases_TX.csv")
global_mobility_report <- load_file("./../data/Global_Mobility_Report.csv")
cases_plus_census <- load_file("./../data/COVID-19_cases_plus_census.csv")
There is a large amount of data provided to us for this project. We are given three different csv files:
All of these files contain important features; however, due to the large amount of data given we will focus on a few of the variables (features) that we believe are critically important. First, lets narrow our focus down to the following variables: ____
head(global_mobility_report)
cols_keep <- c("county_fips_code", "confirmed_cases", "deaths", "median_income", "male_pop", "female_pop", "total_pop", "median_age", "worked_at_home")
# print(cols_keep)
subset_census <- cases_plus_census[cols_keep]
head(subset_census)
tx_missing <- tx_data[cols_with_missing(tx_data)]
global_mobility_report_missing <- global_mobility_report[cols_with_missing(global_mobility_report)]
cases_plus_census_missing <- cases_plus_census[cols_with_missing(cases_plus_census)]
cols_keep <- c("date", "retail_and_recreation_percent_change_from_baseline", "grocery_and_pharmacy_percent_change_from_baseline", "parks_percent_change_from_baseline", "transit_stations_percent_change_from_baseline", "workplaces_percent_change_from_baseline", "residential_percent_change_from_baseline")
# print(cols_keep)
subset_mobility <- global_mobility_report[cols_keep]
head(subset_mobility)
# subset_mobility[['date'] <- as.Date(subset_mobility['date']], format='%m/%d/%y')
# head(subset_mobility)
subset_mobility$date <- as.Date(subset_mobility$date, format="%Y-%m-%d")
# subset_mobility["date"] <- apply(subset_mobility["date"], 2, as.Date, format = '%m/%d/%y')
head(subset_mobility)
sapply(subset_mobility, min, na.rm = TRUE)
summarise(min = min(subset_mobility, na.rm = TRUE),
max = max(subset_mobility, na.rm = TRUE))
print(min(subset_mobility$date, na.rm=TRUE))
print(max(subset_mobility$date, na.rm=TRUE))
cols_with_missing <- function(frame){
colnames(frame)[ apply(frame, 2, anyNA) ]
}
tx_missing <- tx_data[cols_with_missing(tx_data)]
global_mobility_report_missing <- global_mobility_report[cols_with_missing(global_mobility_report)]
cases_plus_census_missing <- cases_plus_census[cols_with_missing(cases_plus_census)]
class(tx_data)
# Track the columns to see if we are removing any
mobility_cols_orig <- colnames(global_mobility_report)
census_cols_orig <- colnames(cases_plus_census)
tx_county_cols_orig <- colnames(tx_county_info)
tx_statewide_cols_orig <- colnames(tx_statewide_info)
# Remove any columns that do not contain any data
global_mobility_report <- remove_empty(global_mobility_report)
value for "which" not specified, defaulting to c("rows", "cols")
cases_plus_census <- remove_empty(cases_plus_census)
value for "which" not specified, defaulting to c("rows", "cols")
tx_county_info <- remove_empty(tx_county_info)
value for "which" not specified, defaulting to c("rows", "cols")
tx_statewide_info <- remove_empty(tx_statewide_info)
value for "which" not specified, defaulting to c("rows", "cols")
# Remove columns that just have a constant value
global_mobility_report <- remove_constant(global_mobility_report)
cases_plus_census <- remove_constant(cases_plus_census)
tx_county_info <- remove_constant(tx_county_info)
tx_statewide_info <- remove_constant(tx_statewide_info)
mobility_cols_new <- colnames(global_mobility_report)
census_cols_new <- colnames(cases_plus_census)
tx_county_cols_new <- colnames(tx_county_info)
tx_statewide_cols_new <- colnames(tx_statewide_info)
First, we noticed that much of our data within the COVID-19_cases_TX file had many rows of data that are statewide and not just for a specific county. Let’s modify the data frame to be texas_county_info and texas_statewide_info.
# Get sets for columns
print(as.set(mobility_cols_orig) - as.set(mobility_cols_new))
{}
print(as.set(census_cols_orig) - as.set(census_cols_new))
{"date", "do_date", "pop_15_and_over", "pop_5_years_over", "pop_divorced", "pop_never_married",
"pop_now_married", "pop_separated", "pop_widowed", "speak_only_english_at_home",
"speak_spanish_at_home", "speak_spanish_at_home_low_english"}
print(as.set(tx_county_cols_orig) - as.set(tx_county_cols_new))
{"state", "state_fips_code"}
print(as.set(tx_county_cols_orig) - as.set(tx_statewide_cols_new))
{"county_fips_code", "county_name", "state", "state_fips_code"}
head(tx_county_info)
head(tx_statewide_info)
Remove columns from the dataframes that do not have any data, have the same data repeated (not useful since implicit knowledge)
# Track the columns to see if we are removing any
mobility_cols_orig <- colnames(global_mobility_report)
census_cols_orig <- colnames(cases_plus_census)
tx_county_cols_orig <- colnames(tx_county_info)
tx_statewide_cols_orig <- colnames(tx_statewide_info)
# Remove any columns that do not contain any data
global_mobility_report <- remove_empty(global_mobility_report)
cases_plus_census <- remove_empty(cases_plus_census)
tx_county_info <- remove_empty(tx_county_info)
tx_statewide_info <- remove_empty(tx_statewide_info)
# Remove columns that just have a constant value
global_mobility_report <- remove_constant(global_mobility_report)
cases_plus_census <- remove_constant(cases_plus_census)
tx_county_info <- remove_constant(tx_county_info)
tx_statewide_info <- remove_constant(tx_statewide_info)
mobility_cols_new <- colnames(global_mobility_report)
census_cols_new <- colnames(cases_plus_census)
tx_county_cols_new <- colnames(tx_county_info)
tx_statewide_cols_new <- colnames(tx_statewide_info)
# Get sets for columns
print(as.set(mobility_cols_orig) - as.set(mobility_cols_new))
print(as.set(census_cols_orig) - as.set(census_cols_new))
print(as.set(tx_county_cols_orig) - as.set(tx_county_cols_new))
print(as.set(tx_county_cols_orig) - as.set(tx_statewide_cols_new))
When removing columns that either do not have data or a constant is repeated throughout the column, the global_mobility_report remains the same; however, the following columns are dropped for the respective dataframes:
head(tx_county_info)
mobility_duplicates <- duplicated(global_mobility_report)
census_duplicates <- duplicated(cases_plus_census)
tx_county_duplicates <- duplicated(tx_county_info)
tx_statewide_duplicates <- duplicated(tx_statewide_info)
print(sum(mobility_duplicates, na.rm = TRUE))
print(sum(census_duplicates, na.rm = TRUE))
print(sum(tx_county_duplicates, na.rm = TRUE))
print(sum(tx_statewide_duplicates, na.rm = TRUE))
We can see that there were no duplicate observations in the data. At least none on the basis that the data was the exact same.
unique(mobility_duplicates)
# make a pct infected column (virus so should get 1 time, but this is disputed for covid)
subset_census['pct_infected'] <- subset_census['confirmed_cases']/subset_census['total_pop']
subset_census['pct_deaths'] <- subset_census['deaths']/subset_census['total_pop']
head(subset_census)
library(RColorBrewer)
plot_vs_county <- function(df, col_val, percentile=FALSE,
fips_title="county_fips_code", banks=6,
legend_title="", graphic_title=""){
# Subset for speed
df <- df[c(fips_title, col_val)]
# Get county data
gcounty <- ggplot2::map_data("county")
# USA map data
gusa <- map_data("state")
if (banks > 9){
mycolors <- colorRampPalette(brewer.pal(9, "Reds"))(banks)
}
# Format with subregions
fipstab <-
transmute(maps::county.fips, fips, county = sub(":.*", "", polyname)) %>%
unique() %>%
separate(county, c("region", "subregion"), sep = ",")
# Combine in desired order (NA for missing)
gcounty <- left_join(gcounty, fipstab, c("region", "subregion"))
dis <- df
dis$rprop <- rank(df[col_val])
dis$pcls <- cut(100 * percent_rank(df[col_val]), seq(0, 100, len = banks),
include.lowest = TRUE)
# Missing data
anti_join(gcounty, dis, by = c("fips" = fips_title)) %>%
select(region, subregion) %>%
unique()
gcounty_pop <- left_join(gcounty, dis, by = c("fips" = fips_title))
fill_vals <- gcounty_pop[col_val]
# Plot
if (legend_title == ""){
legend_title <- col_val
}
if (percentile == FALSE){
# names(gcounty_pop)[names(gcounty_pop) == col_val] <- "col_of_interest"
plt <- ggplot(gcounty_pop) +
geom_polygon(aes(long, lat, group = group, fill = get(col_val)),
color = "grey", size = 0.1, name="Percent Infected") +
geom_polygon(aes(long, lat, group = group),
fill = NA, data = gusa, color = "lightgrey") +
coord_map("bonne", parameters = 41.6) + ggthemes::theme_map()+
scale_fill_gradient2()
# scale_fill_gradient(low = "white", high = "red", na.value = "grey")
# scale_fill_gradientn(colours = terrain.colors(10))
}
if (percentile == TRUE){
plt <- ggplot(gcounty_pop) +
geom_polygon(aes(long, lat, group = group, fill = pcls),
color = "grey", size = 0.1) +
geom_polygon(aes(long, lat, group = group),
fill = NA, data = gusa, color = "lightgrey") +
coord_map("bonne", parameters = 41.6) + ggthemes::theme_map() +
scale_fill_manual(values = mycolors, na.value = "grey") +
# scale_fill_brewer(palette = "viridis", na.value = "grey") +
theme(plot.title = element_text(family = "Helvetica", face = "bold", size = (15)),
legend.background = element_rect(fill = NA),
legend.position = "left")
}
plt <- plt + labs(fill=legend_title) + ggtitle(graphic_title)
plt
}
plot_vs_county(subset_census, "pct_infected", legend_title = "Percent Infected")
Ignoring unknown parameters: name
plot_vs_county(subset_census, "pct_infected", percentile = TRUE, banks=11,
legend_title = "Percentile Infected",
graphic_title = "Percentile of Percentage of People Infected by County")
library(maps)
census_infected <- subset_census[c("county_fips_code", "pct_infected")]
# census_infected$color_density <- rainbow(n = 50, census_infected$pct_infected / max(census_infected$pct_infected))
census_infected$color_density <- heat.colors(census_infected$pct_infected/max(census_infected$pct_infected))
Error in `$<-.data.frame`(`*tmp*`, color_density, value = character(0)) :
replacement has 0 rows, data has 3142
data(county.fips)
## Set up fake df_pop_county data frame
df_pop_county <- data.frame(region=county.fips$fips)
df_pop_county$value <- county.fips$fips
y <- df_pop_county$value
df_pop_county$color <- gray(y / max(y))
## merge population data with county.fips to make sure color column is
## ordered correctly.
counties <- county.fips %>% left_join(df_pop_county, by=c('fips'='region'))
# print(head(counties))
newdata <- census_infected[order(census_infected$county_fips_code),]
newdata
# Through away counties that are not in default
new_data <- newdata[(newdata$county_fips_code %in% counties$fips),]
new_data
data(county.fips)
## Set up fake df_pop_county data frame
df_pop_county <- data.frame(region=county.fips$fips)
df_pop_county$value <- county.fips$fips
y <- df_pop_county$value
df_pop_county$color <- gray(y / max(y))
## merge population data with county.fips to make sure color column is
## ordered correctly.
counties <- county.fips %>% left_join(df_pop_county, by=c('fips'='region'))
Error in county.fips %>% left_join(df_pop_county, by = c(fips = "region")) :
could not find function "%>%"
census_infected
data(county.fips)
df_pop_county <- data.frame(region=county.fips$fips)
df_pop_county
plot_vs_county <- function(df, col_val, percentile=FALSE, fips_title="county_fips_code", banks=6, legend_title=""){
# Subset for speed
df <- df[c(fips_title, col_val)]
# Get county data
gcounty <- ggplot2::map_data("county")
# USA map data
gusa <- map_data("state")
# Format with subregions
fipstab <-
transmute(maps::county.fips, fips, county = sub(":.*", "", polyname)) %>%
unique() %>%
separate(county, c("region", "subregion"), sep = ",")
# Combine in desired order (NA for missing)
gcounty <- left_join(gcounty, fipstab, c("region", "subregion"))
dis <- df
dis$rprop <- rank(df[col_val])
dis$pcls <- cut(100 * percent_rank(df[col_val]), seq(0, 100, len = banks),
include.lowest = TRUE)
# Missing data
anti_join(gcounty, dis, by = c("fips" = fips_title)) %>%
select(region, subregion) %>%
unique()
gcounty_pop <- left_join(gcounty, dis, by = c("fips" = fips_title))
fill_vals <- gcounty_pop[col_val]
# Plot
if (legend_title == ""){
legend_title <- col_val
}
if (percentile == FALSE){
# names(gcounty_pop)[names(gcounty_pop) == col_val] <- "col_of_interest"
plt <- ggplot(gcounty_pop) +
geom_polygon(aes(long, lat, group = group, fill = get(col_val)),
color = "grey", size = 0.1, name="Percent Infected") +
geom_polygon(aes(long, lat, group = group),
fill = NA, data = gusa, color = "lightgrey") +
coord_map("bonne", parameters = 41.6) + ggthemes::theme_map()
}
if (percentile == TRUE){
plt <- ggplot(gcounty_pop) +
geom_polygon(aes(long, lat, group = group, fill = pcls),
color = "grey", size = 0.1) +
geom_polygon(aes(long, lat, group = group),
fill = NA, data = gusa, color = "lightgrey") +
coord_map("bonne", parameters = 41.6) + ggthemes::theme_map() +
scale_fill_brewer(palette = "Reds", na.value = "blue") +
theme(legend.background = element_rect(fill = NA))
}
plt <- plt + labs(fill=legend_title)
plt
}
ggplot2::map_data("county")
as_tibble(maps::county.fips)
# dall <- ggplot2::map_data("county") %>% left_join(as_tibble(maps::county.fips))
install.packages("mapproj")
trying URL 'https://cran.rstudio.com/bin/macosx/contrib/4.0/mapproj_1.2.7.tgz'
Content type 'application/x-gzip' length 83240 bytes (81 KB)
==================================================
downloaded 81 KB
The downloaded binary packages are in
/var/folders/2t/zk2m3vcj2_51x1r2p3cbnt9r0000gn/T//RtmpficDEN/downloaded_packages
# http://homepage.stat.uiowa.edu/~luke/classes/STAT4580-2020/maps.html
# Choropleth Maps of County Population
library(ggplot2)
# install.packages(mapproj)
library(mapproj)
gcounty <- ggplot2::map_data("county")
ggplot(gcounty) +
geom_polygon(aes(long, lat, group = group),
fill = NA, color = "black", size = 0.05) +
coord_map("bonne", parameters = 41.6)
head(filter(maps::county.fips, grepl(":", polyname)))
fipstab <-
transmute(maps::county.fips, fips, county = sub(":.*", "", polyname)) %>%
unique() %>%
separate(county, c("region", "subregion"), sep = ",")
head(fipstab)
gcounty <- left_join(gcounty, fipstab, c("region", "subregion"))
head(gcounty)
test <- gcounty %>% left_join(census_infected, by=c('fips'='county_fips_code'))
test
install.packages("ggthemes")
trying URL 'https://cran.rstudio.com/bin/macosx/contrib/4.0/ggthemes_4.2.4.tgz'
Content type 'application/x-gzip' length 436743 bytes (426 KB)
==================================================
downloaded 426 KB
The downloaded binary packages are in
/var/folders/2t/zk2m3vcj2_51x1r2p3cbnt9r0000gn/T//RtmpficDEN/downloaded_packages
library(ggthemes)
census_infected
census_infected %>% rank(pct_infected)
Error in rank(., pct_infected) : object 'pct_infected' not found
ncls <- 6
w <- census_infected %>% select(fips = county_fips_code, pct_infected) %>% mutate(rpop = rank(pct_infected),
pcls = cut(100 * percent_rank(pct_infected), seq(0, 100, len = ncls),
include.lowest = TRUE))
w
anti_join(gcounty, w, "fips") %>%
select(region, subregion) %>%
unique()
gcounty_pop <- left_join(gcounty, w, "fips")
# filter(gcounty_pop, is.na(rpop)) %>%
# select(region, subregion, pop, rpop, pcls) %>%
# unique()
gcounty_pop
gusa <- map_data("state")
ggplot(gcounty_pop) +
geom_polygon(aes(long, lat, group = group, fill = pct_infected),
color = "grey", size = 0.1, name="Percent Infected") +
geom_polygon(aes(long, lat, group = group),
fill = NA, data = gusa, color = "lightgrey") +
coord_map("bonne", parameters = 41.6) + ggthemes::theme_map()
Ignoring unknown parameters: name
ggplot(gcounty_pop) +
geom_polygon(aes(long, lat, group = group, fill = pcls),
color = "grey", size = 0.1) +
geom_polygon(aes(long, lat, group = group),
fill = NA, data = gusa, color = "lightgrey") +
coord_map("bonne", parameters = 41.6) + ggthemes::theme_map() +
scale_fill_brewer(palette = "Reds", na.value = "blue",
name = "Percentile") +
theme(legend.background = element_rect(fill = NA))
plot_vs_county <- function(df, col_val, percentile=FALSE, fips_title="county_fips_code", banks=6, legend_title=""){
# Subset for speed
df <- df[c(fips_title, col_val)]
# Get county data
gcounty <- ggplot2::map_data("county")
# USA map data
gusa <- map_data("state")
# Format with subregions
fipstab <-
transmute(maps::county.fips, fips, county = sub(":.*", "", polyname)) %>%
unique() %>%
separate(county, c("region", "subregion"), sep = ",")
# Combine in desired order (NA for missing)
gcounty <- left_join(gcounty, fipstab, c("region", "subregion"))
dis <- df
dis$rprop <- rank(df[col_val])
dis$pcls <- cut(100 * percent_rank(df[col_val]), seq(0, 100, len = banks),
include.lowest = TRUE)
# Missing data
anti_join(gcounty, dis, by = c("fips" = fips_title)) %>%
select(region, subregion) %>%
unique()
gcounty_pop <- left_join(gcounty, dis, by = c("fips" = fips_title))
fill_vals <- gcounty_pop[col_val]
# Plot
if (legend_title == ""){
legend_title <- col_val
}
if (percentile == FALSE){
# names(gcounty_pop)[names(gcounty_pop) == col_val] <- "col_of_interest"
plt <- ggplot(gcounty_pop) +
geom_polygon(aes(long, lat, group = group, fill = get(col_val)),
color = "grey", size = 0.1, name="Percent Infected") +
geom_polygon(aes(long, lat, group = group),
fill = NA, data = gusa, color = "lightgrey") +
coord_map("bonne", parameters = 41.6) + ggthemes::theme_map()
}
if (percentile == TRUE){
plt <- ggplot(gcounty_pop) +
geom_polygon(aes(long, lat, group = group, fill = pcls),
color = "grey", size = 0.1) +
geom_polygon(aes(long, lat, group = group),
fill = NA, data = gusa, color = "lightgrey") +
coord_map("bonne", parameters = 41.6) + ggthemes::theme_map() +
scale_fill_brewer(palette = "Reds", na.value = "blue") +
theme(legend.background = element_rect(fill = NA))
}
plt <- plt + labs(fill=legend_title)
plt
}
subset_census
# plot_vs_county(subset_census, "pct_infected", legend_title = "Pecent Infected")
plot_vs_county(subset_census, "pct_infected", percentile = TRUE, legend_title = "Percentile Infected", banks = 10)